---
title: "NCD Baseline Analysis"
author: "Ben Mosong"
date: "`r Sys.Date()`"
output:
  html_document:
    df_print: paged
  pdf_document: default
  word_document: default
---

```{r setup, include=TRUE}
knitr::opts_chunk$set(
  echo = T,
  comment = NA,
  message = FALSE,
  warning = FALSE
)
pacman::p_load(tidyverse,
               gtsummary,
               finalfit,
               prevalence,
               janitor,
               eeptools,
               gt)

df.7 <- read.csv("./DataSet7_8Nov23.csv",
                 header = T,
                 na.strings = c("", "NA"))
df.survey <- read.csv(
  "./HarambeeSurveyAssess_DATA_2024-02-23_1303.csv",
  header = T,
  na.strings = c("", "NA")
)
theme_set(theme_bw())
```

```{r df_ncd}
## Load the data

# Sample month 0,1,2 and 3 data for both Arm A and B participants
df.ncd.7 <- df.7 %>%
  filter((visit_month %in% c('month 0', 'month 1', 'month 2', 'month 3')), arm %in% c(1, 2)) %>%
  remove_empty(c("cols", "rows")) %>%
  select(
    participant_id,
    arm,
    age,
    gender,
    visit_month,
    marital_status,
    marital_status_specify,
    weight,
    height,
    hba1c_results,
    schooling_achieved,
    rbs_result,
    fbs_result,
    known_dm_or_htn,
    specific_ncd_case,
    est_monthly_income,
    systolic_bp = sys_bp,
    diastolic_bp = dys_bp,
    wealthscore,
    date_aim1enroll,
    date_first_join_group,
    ends_with("_summary")
  ) %>%
  select(-starts_with("doseadhere"))


df.months_in_gishe <- df.ncd.7 %>%
  select(participant_id, date_aim1enroll, date_first_join_group) %>%
  mutate(
    date_first_join_group = ifelse((
      as.Date(date_aim1enroll, format = "%d/%m/%Y") <= as.Date(date_first_join_group, format =
                                                                 "%d/%m/%Y")
    ), NA, date_first_join_group),
    date_aim1enroll = ifelse(is.na(date_first_join_group), NA, date_aim1enroll)
  ) %>% filter(!is.na(date_first_join_group)) %>%
  mutate(months_in_gishe = round(age_calc(
    as.Date(date_first_join_group, format = "%d/%m/%Y"),
    as.Date(date_aim1enroll, format = "%d/%m/%Y"),
    units = "months"
  ), 0)) %>%
  select(participant_id, months_in_gishe)

df.ncd.7 <- df.ncd.7 %>% left_join(df.months_in_gishe, by = "participant_id")

df.ncd.m0 <- df.ncd.7 %>%
  filter(visit_month == "month 0") %>%
  remove_empty(c("cols")) %>%
  select(-c(2:5)) %>%
  mutate(
    screened_htn_yes_m0 = ifelse((systolic_bp >= 140 |
                                    diastolic_bp >= 90) &
                                   (is.na(specific_ncd_case) | specific_ncd_case == 1),
                                 1,
                                 0
    ),
    screened_dm_yes_m0 = case_when(
      rbs_result >= 7.8 &
        hba1c_results >= 6.5 &
        (is.na(specific_ncd_case) | specific_ncd_case == 2) ~ 1,
      .default = 0
    )
  )

df.ncd.m1 <- df.ncd.7 %>%
  filter(visit_month == "month 1") %>%
  remove_empty(c("cols")) %>% select(-c(2:5)) %>%
  mutate(
    screened_htn_yes_m1 = ifelse((systolic_bp >= 140 |
                                    diastolic_bp >= 90), 1, 0),
    screened_dm_yes_m1 = case_when(rbs_result >= 7.8 &
                                     hba1c_results >= 6.5 ~ 1, .default = 0)
  )
df.ncd.m3 <- df.ncd.7 %>%
  filter(visit_month == "month 3") %>%
  remove_empty(c("cols")) %>%
  select(-c(2:5)) %>%
  mutate(
    screened_htn_yes_m3 = ifelse((systolic_bp >= 140 |
                                    diastolic_bp >= 90) &
                                   (is.na(specific_ncd_case) | specific_ncd_case == 1),
                                 1,
                                 0
    ),
    screened_dm_yes_m3 = case_when(rbs_result >= 7.8 &
                                     (
                                       is.na(specific_ncd_case) | specific_ncd_case == 2
                                     ) ~ 0, .default = 0)
  )


df.ncd <- df.ncd.7[, c(1:4)] %>%
  distinct(participant_id, .keep_all = T) %>%
  left_join(df.ncd.m0, by = "participant_id") %>%
  left_join(df.ncd.m1, by = "participant_id") %>%
  mutate(
    weight = coalesce(weight.y, weight.x),
    height = coalesce(height.y, height.x),
    hba1c_results = coalesce(hba1c_results.y, hba1c_results.x),
    rbs_result = coalesce(rbs_result.y, rbs_result.x),
    known_dm_or_htn = coalesce(known_dm_or_htn.y, known_dm_or_htn.x),
    months_in_gishe = coalesce(months_in_gishe.x, months_in_gishe.y),
    date_aim1enroll = coalesce(date_aim1enroll.x, date_aim1enroll.y),
    # keep the bp that resulted in a positive case
    systolic_bp = case_when(
      screened_htn_yes_m1 == 1 ~ coalesce(systolic_bp.y, systolic_bp.x),
      .default = coalesce(systolic_bp.x, systolic_bp.y)
    ),
    diastolic_bp = case_when(
      screened_htn_yes_m1 == 1 ~ coalesce(diastolic_bp.y, diastolic_bp.x),
      .default = coalesce(diastolic_bp.x, diastolic_bp.y)
    )
  ) %>%
  select(-ends_with(c(".x", ".y"))) %>%
  left_join(df.ncd.m3, by = "participant_id") %>%
  mutate(
    weight = coalesce(weight.y, weight.x),
    height = coalesce(height.y, height.x),
    rbs_result = case_when(
      screened_dm_yes_m1 == 1 ~ coalesce(rbs_result.y, rbs_result.x),
      .default = coalesce(rbs_result.y, rbs_result.x)
    ),
    months_in_gishe = coalesce(months_in_gishe.x, months_in_gishe.y),
    date_aim1enroll = coalesce(date_aim1enroll.x, date_aim1enroll.y),
    known_dm_or_htn = coalesce(known_dm_or_htn.y, known_dm_or_htn.x),
    specific_ncd_case = coalesce(specific_ncd_case.y, specific_ncd_case.x),
    # keep the bp that resulted in a positive case
    systolic_bp = case_when(
      screened_htn_yes_m1 == 1 ~ coalesce(systolic_bp.y, systolic_bp.x),
      .default = coalesce(systolic_bp.x, systolic_bp.y)
    ),
    diastolic_bp = case_when(
      screened_htn_yes_m1 == 1 ~ coalesce(diastolic_bp.y, diastolic_bp.x),
      .default = coalesce(diastolic_bp.x, diastolic_bp.y)
    )
  ) %>%
  select(-ends_with(c(".x", ".y")))
  
```
\newline
\newline
```{r ncd}

# code the outcome variables
df.ncd <- df.ncd %>%
  mutate(
    htn_confirmed = case_when(
      specific_ncd_case %in% c(2, 3) ~ 1,
      (screened_htn_yes_m0 == 1 & screened_htn_yes_m1) == 1 ~ 1,
      (screened_htn_yes_m0 == 1 & screened_htn_yes_m3) == 1 ~ 1,
      .default = 0
    ),
    dm_confirmed = case_when(
      specific_ncd_case %in% c(1, 3) ~ 1,
      (screened_dm_yes_m0 == 1 & screened_dm_yes_m1) == 1 ~ 1,
      (screened_dm_yes_m0 == 1 & screened_dm_yes_m3) == 1 ~ 1,
      .default = 0
    ),
    newly_confirmed_htn = case_when(
      (
        screened_htn_yes_m0 == 1 &
          (is.na(screened_htn_yes_m1) |
             screened_htn_yes_m1 == 0) &
          (is.na(screened_htn_yes_m3) | screened_htn_yes_m3 == 0)
      ) ~ 1,
      ((is.na(screened_htn_yes_m0) |
          screened_htn_yes_m0 == 0) &
         screened_htn_yes_m1 == 1 &
         (is.na(screened_htn_yes_m3) | screened_htn_yes_m3 == 0)
      ) ~ 1,
      ((is.na(screened_htn_yes_m0) |
          screened_htn_yes_m0 == 0) &
         (is.na(screened_htn_yes_m1) |
            screened_htn_yes_m1 == 0) & screened_htn_yes_m3 == 1
      ) ~ 1,
      .default = 0
    ),
    newly_confirmed_dm = case_when(
      (
        screened_dm_yes_m0 == 1 &
          (is.na(screened_dm_yes_m1) |
             screened_dm_yes_m1 == 0) &
          (is.na(screened_dm_yes_m3) | screened_dm_yes_m3 == 0)
      ) ~ 1,
      ((is.na(screened_dm_yes_m0) |
          screened_dm_yes_m0 == 0) &
         screened_dm_yes_m1 == 1 &
         (is.na(screened_dm_yes_m3) | screened_dm_yes_m3 == 0)
      ) ~ 1,
      ((is.na(screened_dm_yes_m0) |
          screened_dm_yes_m0 == 0) &
         (is.na(screened_dm_yes_m1) |
            screened_dm_yes_m1 == 0) & screened_dm_yes_m3 == 1
      ) ~ 1,
      .default = 0
    )
  ) %>%
  # screened/confirmed positive for HTN
  mutate(
    htn_yes = ifelse(htn_confirmed == 1 |
                       newly_confirmed_htn == 1 , 1, 0),
    dm_yes = ifelse(dm_confirmed == 1 |
                      newly_confirmed_dm == 1 , 1, 0)
  )

# swap weight and height in some cases.
ht_HB1110 <- df.ncd$height[df.ncd$participant_id == 'HB1110']
df.ncd$height[df.ncd$participant_id == 'HB1110'] <- df.ncd$weight[df.ncd$participant_id ==
                                                                    'HB1110']
df.ncd$weight[df.ncd$participant_id == 'HB1110'] <- ht_HB1110

ht_HB1526 <- df.ncd$height[df.ncd$participant_id == 'HB1526']
df.ncd$height[df.ncd$participant_id == 'HB1526'] <- df.ncd$weight[df.ncd$participant_id ==
                                                                    'HB1526']
df.ncd$weight[df.ncd$participant_id == 'HB1526'] <- ht_HB1526
# 35.8cm is a very small height in both inch and cm. Just replace it with the median of its neighborhood using weight as the param
df.ncd$height[df.ncd$participant_id == 'HT2208'] <- median(df.ncd[which(between(df.ncd$weight, 75, 77) &
                                                                          df.ncd$participant_id != 'HT2208'), ]$height)

# unreasonable height of 1156. I guess it should be 156?
df.ncd$height[df.ncd$participant_id == 'HB1120'] <- 156
```
\newline
\newline
```{r labelling}

# create wealth index that is representative of the sample
df.ncd$wealthindex <- statar::xtile(df.ncd$wealthscore, 5)

df.sample <- df.ncd %>%
  select(
    -c(
      participant_id,
      screened_htn_yes_m0,
      screened_htn_yes_m1,
      screened_htn_yes_m3,
      screened_dm_yes_m0,
      screened_dm_yes_m1,
      screened_dm_yes_m3,
      known_dm_or_htn,
      specific_ncd_case,
      marital_status_specify,
      wealthscore,
      date_aim1enroll,
      date_first_join_group
    ),-contains("confirmed")
  ) %>%
  mutate(
    age_cat = case_when(
      age < 35 ~ "18-34 (< 35)",
      between(age, 35, 44) ~ "35-44",
      between(age, 45, 54) ~ "45-54",
      between(age, 55, 64) ~ "55-64",
      age >=  65 ~ "≥ 65",
      .default = NA_character_
    ),
    age = factor(
      age_cat,
      levels = c("18-34 (< 35)", "35-44", "45-54", "55-64", "≥ 65"),
      labels = c("18-34 (< 35)", "35-44", "45-54", "55-64", "≥ 65")
    ) %>% ff_label("Age"),
    weight = as.numeric(weight) %>% ff_label("Weight"),
    # height is in cm
    height = as.numeric(height) %>% ff_label("Height"),
    arm = factor(
      arm,
      levels = c(1, 2),
      labels = c("Arm A (Microfinance + ICB)", "Arm B (Microfinance + SOC)")
    ) %>% ff_label("Arm"),
    bmi_calc = round(weight / (height / 100) ** 2, 1),
    bmi_cat = case_when(
      bmi_calc < 18.5 ~ 1,
      between(bmi_calc, 18.5, 24.9) ~ 2,
      between(bmi_calc, 25, 29.9) ~ 3,
      bmi_calc >= 30 ~ 4,
      .default = NA_integer_
    ),
    bmi = factor(
      bmi_cat,
      levels = c(1, 2, 3, 4),
      labels = c("Underweight", "Normal", "Overweight", "Obesity")
    ) %>% ff_label("BMI"),
    systolic_bp = as.numeric(systolic_bp) %>%  ff_label("Systolic BP"),
    diastolic_bp = as.numeric(diastolic_bp) %>%  ff_label("Diastolic BP"),
    rbs_result = as.numeric(rbs_result) %>%  ff_label("RBS Result"),
    hba1c_results = round(as.numeric(hba1c_results), 1) %>%  ff_label("HbA1C Result"),
    fbs_result = as.numeric(fbs_result) %>%  ff_label("FBS Result"),
    months_in_gishe = as.numeric(months_in_gishe) %>%  ff_label("Months in Gishe"),
    gender = factor(
      gender,
      levels = c(1, 2),
      labels = c("Male", "Female")
    ) %>% ff_label("Gender"),
    marital_status_cat = case_when(
      (marital_status %in% c(1, 3, 4)) ~ 1,
      (marital_status %in% c(2, 5)) ~ 2,
      .default = NA_real_
    ),
    marital_status = factor(
      marital_status_cat,
      levels = c(1, 2),
      labels = c("Cohabiting/Married", "Single/divorced/widowed/separated")
    ) %>% ff_label("Marital Status"),
    schooling_achieved = factor(
      schooling_achieved,
      levels = c(1, 2, 3, 4),
      labels = c("None", "Primary", "Secondary", "Tertiary")
    ) %>% ff_label("Highest Level of Schooling Achieved"),
    stigma_summary = factor(
      stigma_summary,
      levels = c(0, 1),
      labels = c("Low", "High")
    ) %>% ff_label("Felt Stigma Over Past 30 Days"),
    depression_summary = factor(
      depression_summary,
      levels = c(0, 1),
      labels = c("Screened Negative", "Screened Positive")
    ) %>% ff_label("Depressed Over Past 2 Weeks"),
    anxiety_summary = factor(
      anxiety_summary,
      levels = c(0, 1),
      labels = c("Screened Negative", "Screened Positive")
    ) %>% ff_label("Anxiety Over Past 2 Weeks"),
    phq4_summary = factor(
      phq4_summary,
      levels = c(0, 1, 2, 3),
      labels = c("None", "Mild", "Moderate", "Severe")
    ) %>% ff_label("Psychological Distress Over Past 2 Weeks"),
    wealthindex = factor(
      wealthindex,
      levels = c(1, 2, 3, 4, 5),
      labels = c(
        "Quintile 1 (Lowest)",
        "Quintile 2",
        "Quintile 3 (Middle)",
        "Quintile 4",
        "Quintile 5 (Highest)"
      )
    ) %>% ff_label("Wealth Index"),
    support_summary_cat = case_when(
      between(as.numeric(support_summary), 3, 8) ~ 1,
      between(as.numeric(support_summary), 9, 11) ~ 2,
      between(as.numeric(support_summary), 12, 14) ~ 3,
    ),
    support_summary = factor(
      support_summary_cat,
      levels = c(1, 2, 3),
      labels = c("Poor", "Moderate", "Strong")
    ) %>% ff_label("OSLO Social Support Scale"),
    hfais_cat = case_when(
      between(as.numeric(hfias_summary), 0, 1) ~ 1,
      between(as.numeric(hfias_summary), 2, 3) ~ 2,
      between(as.numeric(hfias_summary), 4, 6) ~ 3,
    ),
    hfais_summary = factor(
      hfais_cat,
      levels = c(1, 2, 3),
      labels = c("Little to None", "Moderate", "Severe")
    ) %>% ff_label("HFAIS Score"),
    est_monthly_income_cat = case_when(
      between(as.numeric(est_monthly_income), 98, 99) ~ 1,
      between(as.numeric(est_monthly_income), 1, 2) ~ 2,
      as.numeric(est_monthly_income) == 3 ~ 3,
      as.numeric(est_monthly_income) == 4 ~ 4,
      as.numeric(est_monthly_income) == 5 ~ 5,
      .default = NA_integer_
    ),
    est_monthly_income2_cat = case_when(
      as.numeric(est_monthly_income) == 6 ~ 1,
      between(as.numeric(est_monthly_income), 1, 2) ~ 2,
      as.numeric(est_monthly_income) == 3 ~ 3,
      as.numeric(est_monthly_income) == 4 ~ 4,
      as.numeric(est_monthly_income) == 5 ~ 5,
      as.numeric(est_monthly_income) == 98 ~ 6,
      .default = NA_integer_
    ),
    est_monthly_income = factor(
      est_monthly_income_cat,
      levels = c(1, 2, 3, 4, 5),
      labels = c(
        "Non-monetary gifts",
        "<3,000",
        "3,000 - 4,999",
        "5,000 - 9,999",
        ">=10,000"
      )
    )  %>% ff_label("Est. Monthly Income(KES)"),
    est_monthly_income_2 = factor(
      est_monthly_income2_cat,
      levels = c(1, 2, 3, 4, 5, 6),
      labels = c(
        "Non-monetary gifts",
        "<3,000",
        "3,000 - 4,999",
        "5,000 - 9,999",
        ">=10,000",
        "Could not Estimate"
      )
    )  %>% ff_label("Est. Monthly Income(KES) 2"),
    htn_yes = factor(
      htn_yes,
      levels = c(1, 0),
      labels = c("Positive", "Negative")
    ) %>%  ff_label("HTN Status"),
    dm_yes = factor(
      dm_yes,
      levels = c(1, 0),
      labels = c("Positive", "Negative")
    ) %>%  ff_label("DM Status"),
    ncd_cat = case_when(
      (htn_yes == "Positive" & dm_yes != "Positive") ~ 1,
      (dm_yes == "Positive" & htn_yes != "Positive") ~ 2,
      (dm_yes == "Positive" & htn_yes == "Positive") ~ 3,
      .default = 4,
    ),
    ncd_status = factor(
      ncd_cat,
      levels = c(1, 2, 3, 4),
      labels = c("HTN", "DM", "HTN & DM", "Normal")
    ) %>%  ff_label("NCD"),
    hiv_ncd_cat = case_when(ncd_cat %in% c(1, 2, 3) ~ 1, ncd_cat == 4 ~ 0, .default = 0, ),
    hiv_ncd_status = factor(
      hiv_ncd_cat,
      levels = c(0, 1),
      labels = c("HIV Only", "HIV & NCD")
    ) %>%  ff_label("HIV/NCD  Status"),
  ) %>% 
  # there might have been data entry errors in date_joined_gishe variable. Baseline requirement was 6months, enforce that.
  mutate(
    months_in_gishe = if_else(months_in_gishe <= 5, 6, months_in_gishe)
  ) %>%
  select(-ends_with("_cat"), -hfias_summary, -fbs_result)


# what are the stats for continuous body mass index

summary(df.sample$bmi_calc, na.rm = TRUE)
summary_stats <- df.sample %>%
  summarise(
    mean_bmi = mean(bmi_calc, na.rm = TRUE ),
    sd_bmi = sd(bmi_calc, na.rm = TRUE)
  )
print(summary_stats)


df.sample <- df.sample %>% select(-bmi_calc)

# export the sample dataset

write.csv(df.sample,
          paste0("./HTN_DM_Analytic_Data_", Sys.Date(), ".csv"),
          row.names = F)


  

```
\newline
\newline
```{r to_delete}

# try to visualiza the distribution of months in gishe before and after transformation
months <- na.omit(df.sample$months_in_gishe)

months_log <- log(months)
hist(months, xlab ="Months", main = "Months in Gishe before transformation")
hist(months_log, xlab ="Months",  main = "Months in Gishe after natural log transformation")
```

\newline
\newline
```{r prevalence_overall}

# summarize prevalence
df.sample %>% 
  select(ncd_status) %>%
  tbl_summary(statistic = list(all_categorical() ~ "{n} ({p}%)"),
              missing = "no") %>%
  add_ci() %>%
  bold_labels() %>%
  modify_header(label ~ "") %>%
  modify_caption("**Baseline HTN/DM Prevalence**")

```
\newline
\newline
```{r prevalence_ncd_only}
# summarize NCD only prevalence
df.sample %>% 
  select(ncd_status) %>% 
  filter(ncd_status == "HTN & DM") %>%
  separate(ncd_status, c("htn_yes", "dm_yes"), "&") %>% 
  pivot_longer(c("htn_yes", "dm_yes"), values_to  = "ncd_status", names_to = NULL) %>% 
  rbind(
    df.sample %>%  select(ncd_status) %>%  filter(ncd_status %in% c("HTN", "DM"))
  ) %>% 
  mutate(ncd_status = factor(str_trim(ncd_status)) %>%  ff_label("NCD")) %>%
  tbl_summary(statistic = list(all_categorical() ~ "{n} ({p}%)")) %>%
  add_ci() %>%
  bold_labels() %>%
  modify_header(label ~ "") %>%
  modify_caption("**Baseline NCD Prevalence**")

```
\newline
\newline
```{r describe_numeric}
theme_gtsummary_journal("jama")

theme_gtsummary_compact()


df.cont.summary <- df.sample %>% 
  mutate(months_in_gishe = log(months_in_gishe)) %>%
  select(where(is.numeric)) %>%
  skimr::skim() %>%
  mutate(n = nrow(df.sample) - n_missing,
         missingness = round((1 - complete_rate) * 100, 1)) %>%
  data.frame() %>%
  select(-skim_type,
         -skim_variable,
         -n_missing,
         -complete_rate,
         -numeric.hist) %>%
  round(1)

df.cont.summary <- cbind(data.frame(
  var_ = c(
    "HbA1C Result",
    "Weight(kg)",
    "Height(cm)",
    "RBS Result",
    "Months in Gishe",
    "Systolic BP",
    "Diastolic BP"
  )
), df.cont.summary)

colnames(df.cont.summary) <- c("Variable",
                               "Mean",
                               "SD",
                               "Min",
                               "P25",
                               "P50",
                               "P75",
                               "Max",
                               "n",
                               "% Missing")


df.cont.summary %>%
  gt(id = "describe_numeric") %>%
  cols_width(Variable ~ px(150), everything() ~ px(80)) |>
  cols_align(align = "left", columns = everything()) %>%
  tab_header(title = "Overal Description of Numeric Variables Used") %>%
  opt_align_table_header(align = "left") %>%
  opt_css(
    css = "
    #describe_numeric .gt_table {
          border-top: 0;
    }

    #describe_numeric .gt_header th{
          font-weight: 700;
          padding-top: 8px;
          padding-bottom: 8px;
          color: #777;
          font-size: 13px;
    }

    #describe_numeric .gt_col_headings th{
          font-weight: 700;
    }

    #describe_numeric .gt_table_body {
      font-size: 13px;
    }

    #describe_numeric .gt_row {
      padding-top: 1px;
      padding-bottom: 1px;

    }

    "
  )
  
```
\newline
\newline
```{r describe_categorical}

# summarize categorical vars
df.sample %>%
  select(where(is.factor), -ncd_status, -dm_yes, -htn_yes, -arm) %>%
  tbl_summary(missing_text = "Missing", by = hiv_ncd_status) %>%
  add_n() %>%
  bold_labels() %>%
  modify_caption("**Overal Description of Categorical Variables Used**") %>%
  modify_header(label ~ "")

```
\newline
\newline
```{r characteristics_hiv_ncd_status}

# NCD cahracteristics by hiv_ncd_status
df.sample <- df.sample %>% 
  mutate(months_in_gishe = log(months_in_gishe) %>%  ff_label("Months in Gishe"))

df.sample %>% 
  select(-dm_yes, -htn_yes, -ncd_status, -arm, -hba1c_results) %>% 
  tbl_summary(
    by = hiv_ncd_status,
    type = all_continuous() ~ "continuous2",
    statistic = list(
      all_continuous() ~ c(
      "{N_nonmiss}",
      "{mean} ({sd})",
      "{min}, {max}"
    ),
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "no") %>%
  add_n() %>%
  add_overall() %>%
  add_p() %>%
  bold_labels() %>% 
  modify_header(label ~ "") %>% 
  modify_caption("**HIV/NCD Characteristics by HIV/NCD Status**")

```

\newline
\newline 
```{r htn_regression}

# remove some variables not needed in the model
model_remove_vars <- df.sample %>%
  select(
    dm_yes,
    htn_yes,
    rbs_result,
    hba1c_results,
    systolic_bp,
    diastolic_bp,
    height,
    weight,
    ncd_status,
    arm,
    wealthindex,
    phq4_summary
  ) %>%
  colnames()

# univariate regression
model_unadj <- df.sample %>% 
  
  select(-all_of(model_remove_vars), -est_monthly_income_2) %>%
  tbl_uvregression(
    method = glm,
    y = hiv_ncd_status,
    exponentiate = TRUE,
    method.args = list(family = binomial(link = "logit")),
    estimate_fun = function(x)
      style_number(x, digits = 2)
  ) %>%
  add_global_p() %>%
  bold_labels()

# multivariate regression
model_adj <- glm(hiv_ncd_status ~ .,
                 (df.sample %>% select(-all_of(model_remove_vars), -est_monthly_income_2)),
                 family = binomial(link = "logit")) %>%
  tbl_regression(
    exponentiate = TRUE,
    estimate_fun = function(x)
      style_number(x, digits = 2)
  ) %>%
  add_global_p() %>%
  add_vif() %>%
  bold_labels()


#combine the tables
tbl_merge(
  tbls = list(model_unadj, model_adj),
  tab_spanner = c("**Unadjusted OR**", "**Adjusted OR**")
) %>%
  modify_header(label ~ "") %>%
  modify_caption("**Logistic regression analysis with HIV/NCD Status as the outcome variable.**") %>%
  modify_footnote(
    update = everything() ~ "*HbA1C Result* was removed because Wilcoxon Rank Sum test could not be calculated.<br>
                  *Systolic BP*, *Diastolic BP* and *RBS Result* were also not included in the model because the outcome variable is a derivative of both.<br>
                  *Height* and *Weight* are accounted for in BMI"
  )

```
\newline
\newline  

### Comparing the predicitive power of model with income 1 categories and income 2 categories  

#### Comparing income 1 and 2 categories. There wouldn't be much difference in the predictive power of the models. We could chose either in terms of levels.  

```{r roc1}

library(ggplot2)
library(pROC)
# use est_monthly_income
data1 <- df.sample %>% select(-all_of(model_remove_vars), -est_monthly_income_2)

# use est_monthly_income_2
data2 <- df.sample %>% select(-all_of(model_remove_vars), -est_monthly_income)

set.seed(1)
sample <- sample(c(TRUE, FALSE), nrow(data1), replace=TRUE, prob=c(0.7,0.3))

train1 <- data1[sample, ]
test1 <- data1[!sample, ]

train2 <- data2[sample, ]
test2 <- data2[!sample, ]


# model with income
model_adj.income1 <- glm(hiv_ncd_status ~ ., data = data1, family = binomial(link = "logit"))

#use model to make predictions on test set
predicted1 <- predict(model_adj.income1, test1, type="response")

#define object to plot
rocobj1 <- roc(test1$hiv_ncd_status, predicted1, smooth=T, ci=T)

# model without income
model_adj.income2 <- glm(hiv_ncd_status ~ ., data = data2, family = binomial(link = "logit"))
#use model to make predictions on test set
predicted2 <- predict(model_adj.income2, test2, type="response")

#define object to plot
rocobj2 <- roc(test2$hiv_ncd_status, predicted2, smooth=T, ci=T)

auc1 <- round(auc(test1$hiv_ncd_status, predicted1),4)
auc2 <- round(auc(test2$hiv_ncd_status, predicted2),4)

#create ROC plots
ggroc(list(`Model WIth Income 1` = rocobj1, `Model WIth Income 2` = rocobj2)) +
  ggtitle(paste0('Model 1 : ROC Curve (AUC = ', auc1, ')', '\n', 'Model 2 : ROC Curve (AUC = ', auc2, ')')) +
  theme_minimal() + 
  labs(color = "") +
  theme(legend.position = "bottom", plot.title = element_text(size=9))

```

####  Comparing a model with income predictor and one without.  

At 25% missing data for income variable( Income 2 in this case), we are slightly better off excluding it than including it. But if we believe it should be in the model, we are not likely to lose much given that the confidence intervals of the estimates with included income accomodates the estimates of the model with excluded income predictor.  

```{r roc2}

library(ggplot2)
library(pROC)


# what if we don't use any income variable?
data3 <- df.sample %>% select(-all_of(model_remove_vars), -est_monthly_income_2, -est_monthly_income)

set.seed(4)
sample <- sample(c(TRUE, FALSE), nrow(data1), replace=TRUE, prob=c(0.7,0.3))

train3 <- data3[sample, ]
test3 <- data3[!sample, ]


# model with no income
model_adj.income3 <- glm(hiv_ncd_status ~ ., data = data3, family = binomial(link = "logit"))

#use model to make predictions on test set
predicted3 <- predict(model_adj.income3, test3, type="response")

#define object to plot
rocobj3 <- roc(test3$hiv_ncd_status, predicted3, smooth=T)


# defeine the confidence intervals
ci.list <- ci.se(rocobj2, specificities = seq(0, 1, l = 25))
dat.ci.list <- data.frame(x = as.numeric(rownames(ci.list)),
             lower = ci.list[, 1],
             upper = ci.list[, 3])

auc3 <- round(auc(test3$hiv_ncd_status, predicted3),4)

#create ROC plots, with CI for the model with income
ggroc(list(`Includes Est. Income` = rocobj2, `Excludes Est. Income` = rocobj3)) + 
  geom_ribbon(data = dat.ci.list, aes(x = x, ymin = lower, ymax = upper), fill = 2,alpha = 0.2, inherit.aes = F) +
  ggtitle(paste0('Includes Est. Income (AUC = ', auc2, ')', '\n', 'Excludes Est. Income (AUC = ', auc3, ')')) +
  theme_minimal() + 
  labs(color = "") +
  theme(legend.position = "bottom", plot.title = element_text(size=9))

```

#### We can then choose to have est. income 2 predictor in model despite missing 25% of the data.  

```{r htn_regression2}

# univariate regression
model_unadj <- df.sample %>%
  select(-all_of(model_remove_vars), -est_monthly_income) %>%
  tbl_uvregression(
    method = glm,
    y = hiv_ncd_status,
    exponentiate = TRUE,
    method.args = list(family = binomial(link = "logit")),
    estimate_fun = function(x)
      style_number(x, digits = 2)
  ) %>%
  add_global_p() %>%
  bold_labels()

# multivariate regression
model_adj <- glm(hiv_ncd_status ~ .,
                 (df.sample %>% select(-all_of(model_remove_vars), -est_monthly_income)),
                 family = binomial(link = "logit")) %>%
  tbl_regression(
    exponentiate = TRUE,
    estimate_fun = function(x)
      style_number(x, digits = 2)
  ) %>%
  add_global_p() %>%
  add_vif() %>%
  bold_labels()


#combine the tables
tbl_merge(
  tbls = list(model_unadj, model_adj),
  tab_spanner = c("**Unadjusted OR**", "**Adjusted OR**")
) %>%
  modify_header(label ~ "") %>%
  modify_caption("**Logistic regression analysis with HIV/NCD Status as the outcome variable and using Est. Monthly Income 2**") %>%
  modify_footnote(
    update = everything() ~ "*HbA1C Result* was removed because Wilcoxon Rank Sum test could not be calculated.<br>
                  *Systolic BP*, *Diastolic BP* and *RBS Result* were also not included in the model because the outcome variable is a derivative of both.<br>
                  *Height* and *Weight* are accounted for in BMI"
  )

```
\newline
\newline





